Running the SOM clustering procedure with our given parameters.
#Code using kohonen package instead of sombrero
gridsize=20
num_clusters=8
set.seed(123)
#Uncomment the below lines to be able to generate a map de novo
# som.grid <- somgrid(xdim=gridsize,ydim=gridsize,topo="hexagonal",toroidal=TRUE)
# som.model <- som(metabolite_matrix,grid=som.grid,rlen=1500,alpha=c(0.025,0.01))
#Comment this line out if you plan to generate a map de novo
load("Data/final-kohonen-som.RData")
#This has been an issue but we need to set the specific seed present *after* running the original SOM model to be able to reproduce the exact results
.Random.seed <- good.seed
#Plot the training progress to see and also to ensure right parameters were used
# plot(som.model,type="changes")
carve.som <- som.model$codes[[1]]
som.dist <- as.matrix(dist(carve.som))
try.k <- 2:20
#Load the pre-saved cluster data
load("Data/final-som-clusters.RData")
k <- num_clusters
#There's a nebulous issue with defining seeds while generating a markdown file, so at the moment we are loading a pre-saved data object containing the relevant cluster definitions using the correct seeds. You can uncomment this if you wish to run the chunks directly.
# cluster.dist.eval <- as.data.frame(matrix(ncol = 3, nrow = (length(try.k))))
# colnames(cluster.dist.eval) <- c('k', 'kmeans', 'hclust')
#
# for(i in 1:length(try.k)) {
# cluster.dist.eval[i, 'k'] <- try.k[i]
# cluster.dist.eval[i, 'kmeans'] <- clusterMeanDist(kmeans(carve.som, centers = try.k[i], iter.max = 20)$cluster)
# cluster.dist.eval[i, 'hclust'] <- clusterMeanDist(cutree(hclust(vegdist(carve.som,method="euclidean")), k = try.k[i]))
# }
#Plot the kmeans distances as a function of the number of clusters for both clustering methodologies
plot(cluster.dist.eval[, 'kmeans'] ~ try.k,
type = 'l')
lines(cluster.dist.eval[, 'hclust'] ~ try.k,
col = 'red')
legend('topright',
legend = c('k-means', 'hierarchical'),
col = c('black', 'red'),
lty = c(1, 1))

# cluster.tries <- list()
#
# for(k in num_clusters){
#
# ## k-means clustering
#
# som.cluster.k <- kmeans(carve.som, centers = k, iter.max = 10000, nstart = 10)$cluster # k-means
#
# ## hierarchical clustering
#
# som.dist <- dist(carve.som) # hierarchical, step 1
# som.cluster.h <- cutree(hclust(som.dist), k = k) # hierarchical, step 2
#
# ## capture outputs
#
# cluster.tries[[paste0('som.cluster.k.', k)]] <- som.cluster.k
# cluster.tries[[paste0('som.cluster.h.', k)]] <- som.cluster.h
# }
obs_density <- data.frame(node=som.model$unit.classif)%>%group_by(node)%>%summarise(count=n())
# ggplot(obs_density,aes(x=count))+geom_histogram()+custom_theme
#Want to find the average variance and number of models represented in each cluster
clusters <- som.cluster.k
ids <- som.model$unit.classif
sample_clusters <- som.cluster.k[som.model$unit.classif]
na.nodes <- which(c(1:gridsize^2)%in%unique(obs_density$node)==FALSE)
som.cluster.k[na.nodes] <- NA
all_vars <- c()
for (i in 1:k){
curr_clust=i
curr_models <- which(clusters==curr_clust)
if (length(curr_models)>1){
curr_vars <- apply(metabolite_matrix[curr_models,],2,sd)%>%mean()
all_vars <- c(all_vars,curr_vars)
} else {
all_vars <- c(all_vars,0)
}
}
# var_df <- data.frame(clust_size=table(sample_clusters),mean_var=all_vars)
# colnames(var_df) <- c("cluster","num_models","mean_var")
# ggplot(var_df,aes(x=num_models,y=mean_var,color=cluster))+geom_point(size=5)+labs(color="Cluster",x="Number of models in cluster",y="Average variance in cluster")+scale_color_manual(values = c("#a0a052","#6c63d7","#a0b533","#c957ca","#5db855","#98459e","#d89d37","#7a93dd","#c9532a","#46aed7","#d6445a","#50b593","#d1438d","#4c7937","#c285de","#91682d","#625fa3","#dd8a68","#d883af","#a44d61"))
SOM cluster analysis… done!
## [1] "There were 578 genomes with duplicate growth sensitivity vectors across all models generated for that genome."
## [1] "MARD_SAMN05216575_REFG_MMP05216575 has 2 SOM clusters with 29 models each."
## [1] "MARD_SAMN07327725_REFG_MMP07327725 has 2 SOM grid points with 18 models each."
## [1] "MARD_SAMN08912472_REFG_MMP08912472 has 2 SOM grid points with 21 models each."
Tree generation… done!
Metabolite pattern analysis… done!
Phylogenetic analysis… done!
Growth estimate (dCUB) analysis… done!
## [1] "Results of non-parametric Kruskal-Wallis Test:"
##
## Kruskal-Wallis rank sum test
##
## data: kruskal.list
## Kruskal-Wallis chi-squared = 128.06, df = 7, p-value < 2.2e-16
Comparison to Matti’s data… done!
## [1] "Metabolite sarcosine has no matches!"
## [1] "Metabolite taurine has no matches!"
## [1] "Metabolite hydroxyproline has no matches!"
## [1] "Metabolite oxalacetate has no matches!"
## [1] "Metabolite valerate has no matches!"
## [1] "Metabolite malonate has no matches!"
## [1] "Metabolite 3m2-oxybutyrate has no matches!"
## [1] "Metabolite gluconate lactone has no matches!"
## [1] "Metabolite glucuronate lactone has no matches!"
## [1] "Metabolite glcnac has no matches!"
## [1] "Metabolite galnac has no matches!"
## [1] "Metabolite melibiose has no matches!"
## [1] "Metabolite lactulose has no matches!"
## [1] "Metabolite alpha-cyclodextrin has no matches!"
## [1] "Metabolite erythrose has no matches!"
Read recruitment analysis… done!


Table Generation